\chapter{Mathematical Foundations}
\label{app:mathematical_foundations}

% Set section numbering for Appendix C
\renewcommand{\thesection}{C.\arabic{section}}
\renewcommand{\thesubsection}{C.\arabic{section}.\arabic{subsection}}
\renewcommand{\thesubsubsection}{C.\arabic{section}.\arabic{subsection}.\arabic{subsubsection}}

This appendix provides the essential mathematical background required to understand the theoretical developments throughout this book. We cover probability theory, linear algebra, convex analysis, information theory, and graph theory fundamentals with particular emphasis on concepts relevant to AI-powered code intelligence systems.

\section{Probability Theory Basics}

\subsection{Probability Spaces and Measure Theory}

\begin{definition}[Probability Space]
\label{def:probability_space}
A probability space is a triple $(\Omega, \mathcal{F}, \Prob)$ where:
\begin{itemize}
    \item $\Omega$ is the sample space (set of all possible outcomes)
    \item $\mathcal{F}$ is a $\sigma$-algebra on $\Omega$ (collection of measurable events)
    \item $\Prob: \mathcal{F} \rightarrow [0,1]$ is a probability measure satisfying:
    \begin{enumerate}
        \item $\Prob(\Omega) = 1$ (normalization)
        \item $\Prob(\emptyset) = 0$ (empty set has zero probability)
        \item For disjoint events $A_1, A_2, \ldots$: $\Prob(\bigcup_{i=1}^{\infty} A_i) = \sum_{i=1}^{\infty} \Prob(A_i)$ (countable additivity)
    \end{enumerate}
\end{itemize}
\end{definition}

\begin{definition}[Conditional Probability]
\label{def:conditional_probability}
For events $A, B \in \mathcal{F}$ with $\Prob(B) > 0$:
\begin{equation}
\Prob(A | B) = \frac{\Prob(A \cap B)}{\Prob(B)}
\end{equation}
\end{definition}

\begin{theorem}[Bayes' Theorem]
\label{thm:bayes_theorem}
For events $A_1, \ldots, A_n$ forming a partition of $\Omega$ with $\Prob(A_j) > 0$ for each $j$, and an event $B$ with $\Prob(B) > 0$:
\begin{equation}
\Prob(A_i | B) = \frac{\Prob(B | A_i) \Prob(A_i)}{\sum_{j=1}^n \Prob(B | A_j) \Prob(A_j)}
\end{equation}
\end{theorem}
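
The following Python sketch applies Bayes' theorem to a small fault-localization example; the module names and the prior and likelihood values are purely illustrative.

\begin{verbatim}
# Hypothetical example: posterior over which module introduced a bug,
# given that a particular test fails.
priors = {"parser": 0.5, "optimizer": 0.3, "codegen": 0.2}          # P(A_i)
likelihoods = {"parser": 0.10, "optimizer": 0.40, "codegen": 0.05}  # P(B | A_i)

evidence = sum(likelihoods[a] * priors[a] for a in priors)          # P(B)
posterior = {a: likelihoods[a] * priors[a] / evidence for a in priors}
print(posterior)  # the P(A_i | B) values sum to 1
\end{verbatim}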

\subsection{Random Variables and Distributions}

\begin{definition}[Random Variable]
\label{def:random_variable}
A random variable $X$ on probability space $(\Omega, \mathcal{F}, \Prob)$ is a measurable function $X: \Omega \rightarrow \R$ such that for all Borel sets $B \subseteq \R$:
\begin{equation}
X^{-1}(B) = \{\omega \in \Omega : X(\omega) \in B\} \in \mathcal{F}
\end{equation}
\end{definition}

\begin{definition}[Expectation]
\label{def:expectation}
For a random variable $X$ with distribution $\mu$:
\begin{equation}
\E[X] = \int_{\R} x \, d\mu(x)
\end{equation}
For discrete random variables: $\E[X] = \sum_{x} x \Prob(X = x)$
\end{definition}

\begin{definition}[Variance and Covariance]
\label{def:variance_covariance}
For random variables $X, Y$:
\begin{align}
\Var(X) &= \E[(X - \E[X])^2] = \E[X^2] - (\E[X])^2 \\
\Cov(X, Y) &= \E[(X - \E[X])(Y - \E[Y])] = \E[XY] - \E[X]\E[Y]
\end{align}
\end{definition}

\begin{theorem}[Strong Law of Large Numbers]
\label{thm:lln}
Let $X_1, X_2, \ldots$ be independent, identically distributed random variables with finite expectation $\mu$. Then:
\begin{equation}
\lim_{n \rightarrow \infty} \frac{1}{n} \sum_{i=1}^n X_i = \mu \quad \text{almost surely}
\end{equation}
\end{theorem}

\begin{theorem}[Central Limit Theorem]
\label{thm:clt}
Let $X_1, X_2, \ldots$ be i.i.d. random variables with mean $\mu$ and finite variance $\sigma^2 > 0$. Then:
\begin{equation}
\lim_{n \rightarrow \infty} \Prob\left(\frac{\sum_{i=1}^n X_i - n\mu}{\sigma\sqrt{n}} \leq z\right) = \Phi(z)
\end{equation}
where $\Phi$ is the standard normal cumulative distribution function.
\end{theorem}
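
Both limit theorems are easy to observe numerically. The following Python sketch (assuming NumPy is available) draws i.i.d. samples from an exponential distribution with mean $2$ and variance $4$; sample means concentrate near the mean, and standardized sums are approximately standard normal.

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n, trials = 5_000, 1_000
# Exponential with scale 2: mean mu = 2, variance sigma^2 = 4.
samples = rng.exponential(scale=2.0, size=(trials, n))

# Law of large numbers: each row mean is close to mu = 2.
print(samples.mean(axis=1)[:5])

# Central limit theorem: standardized row sums are approximately N(0, 1).
z = (samples.sum(axis=1) - n * 2.0) / (2.0 * np.sqrt(n))
print(z.mean(), z.std())  # close to 0 and 1
\end{verbatim}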

\subsection{Concentration Inequalities}

\begin{theorem}[Hoeffding's Inequality]
\label{thm:hoeffding}
Let $X_1, \ldots, X_n$ be independent random variables with $X_i \in [a_i, b_i]$ almost surely. Then for any $t > 0$:
\begin{equation}
\Prob\left(\left|\frac{1}{n}\sum_{i=1}^n X_i - \frac{1}{n}\sum_{i=1}^n \E[X_i]\right| \geq t\right) \leq 2\exp\left(-\frac{2n^2t^2}{\sum_{i=1}^n (b_i - a_i)^2}\right)
\end{equation}
\end{theorem}
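
When every variable has the same range $[a, b]$, the right-hand side simplifies to $2\exp(-2nt^2/(b-a)^2)$. The following Python sketch evaluates this bound; the test-run scenario in the comment is illustrative.

\begin{verbatim}
import numpy as np

def hoeffding_bound(n, t, a=0.0, b=1.0):
    """Two-sided Hoeffding bound for the mean of n i.i.d. [a, b]-valued variables."""
    return 2.0 * np.exp(-2.0 * n * t ** 2 / (b - a) ** 2)

# E.g. the chance that an empirical pass rate over 1000 runs deviates
# from its expectation by more than 0.05 is at most:
print(hoeffding_bound(n=1000, t=0.05))   # about 0.0135
\end{verbatim}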

\section{Linear Algebra}

\subsection{Vector Spaces and Linear Maps}

\begin{definition}[Vector Space]
\label{def:vector_space}
A vector space $V$ over field $\mathbb{F}$ is a set equipped with operations $+: V \times V \rightarrow V$ and $\cdot: \mathbb{F} \times V \rightarrow V$ satisfying:
\begin{itemize}
    \item Associativity: $(\mathbf{u} + \mathbf{v}) + \mathbf{w} = \mathbf{u} + (\mathbf{v} + \mathbf{w})$
    \item Commutativity: $\mathbf{u} + \mathbf{v} = \mathbf{v} + \mathbf{u}$
    \item Identity: $\exists \mathbf{0} \in V$ such that $\mathbf{v} + \mathbf{0} = \mathbf{v}$ for all $\mathbf{v} \in V$
    \item Inverse: For each $\mathbf{v} \in V$, $\exists -\mathbf{v} \in V$ such that $\mathbf{v} + (-\mathbf{v}) = \mathbf{0}$
    \item Scalar multiplication properties (distributivity, associativity, unity)
\end{itemize}
\end{definition}

\begin{definition}[Inner Product]
\label{def:inner_product}
An inner product on real vector space $V$ is a function $\langle \cdot, \cdot \rangle: V \times V \rightarrow \R$ satisfying:
\begin{itemize}
    \item Symmetry: $\langle \mathbf{u}, \mathbf{v} \rangle = \langle \mathbf{v}, \mathbf{u} \rangle$
    \item Linearity: $\langle a\mathbf{u} + b\mathbf{v}, \mathbf{w} \rangle = a\langle \mathbf{u}, \mathbf{w} \rangle + b\langle \mathbf{v}, \mathbf{w} \rangle$
    \item Positive definiteness: $\langle \mathbf{v}, \mathbf{v} \rangle \geq 0$ with equality iff $\mathbf{v} = \mathbf{0}$
\end{itemize}
\end{definition}

\subsection{Matrix Theory}

\begin{definition}[Matrix Norms]
\label{def:matrix_norms}
For matrix $A \in \R^{m \times n}$:
\begin{align}
\|A\|_F &= \sqrt{\sum_{i,j} A_{ij}^2} \quad \text{(Frobenius norm)} \\
\|A\|_2 &= \sigma_{\max}(A) \quad \text{(spectral norm)} \\
\|A\|_* &= \sum_i \sigma_i(A) \quad \text{(nuclear norm)}
\end{align}
where $\sigma_i(A)$ are the singular values of $A$.
\end{definition}

\begin{theorem}[Singular Value Decomposition]
\label{thm:svd}
For any matrix $A \in \R^{m \times n}$, there exist orthogonal matrices $U \in \R^{m \times m}$ and $V \in \R^{n \times n}$ and diagonal matrix $\Sigma \in \R^{m \times n}$ with non-negative entries such that:
\begin{equation}
A = U\Sigma V^T
\end{equation}
The diagonal entries $\sigma_1 \geq \sigma_2 \geq \cdots \geq \sigma_{\min(m,n)} \geq 0$ are the singular values of $A$.
\end{theorem}
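
Truncating the SVD to the $k$ largest singular values gives the best rank-$k$ approximation in the spectral norm, with error $\sigma_{k+1}$ (the Eckart--Young theorem). A short NumPy sketch of the factorization and the truncation:

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((6, 4))

U, s, Vt = np.linalg.svd(A, full_matrices=False)   # A = U @ diag(s) @ Vt
assert np.allclose(A, U @ np.diag(s) @ Vt)

# Best rank-k approximation: keep the k largest singular values.
k = 2
A_k = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]
print(np.linalg.norm(A - A_k, 2), s[k])            # spectral error equals sigma_{k+1}
\end{verbatim}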

\begin{definition}[Eigenvalues and Eigenvectors]
\label{def:eigenvalues}
For square matrix $A \in \R^{n \times n}$, scalar $\lambda \in \C$ is an eigenvalue with eigenvector $\mathbf{v} \in \C^n$ if:
\begin{equation}
A\mathbf{v} = \lambda\mathbf{v}, \quad \mathbf{v} \neq \mathbf{0}
\end{equation}
\end{definition}

\begin{theorem}[Spectral Theorem for Symmetric Matrices]
\label{thm:spectral_theorem}
For symmetric matrix $A \in \R^{n \times n}$, there exists orthogonal matrix $Q$ and diagonal matrix $\Lambda$ such that:
\begin{equation}
A = Q\Lambda Q^T
\end{equation}
where $\Lambda = \text{diag}(\lambda_1, \ldots, \lambda_n)$ contains the eigenvalues.
\end{theorem}
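
For symmetric matrices, NumPy's \texttt{numpy.linalg.eigh} returns exactly this factorization (real eigenvalues in ascending order and an orthogonal matrix of eigenvectors); a minimal check:

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
B = rng.standard_normal((5, 5))
A = (B + B.T) / 2                       # make the matrix symmetric

eigvals, Q = np.linalg.eigh(A)          # real eigenvalues, orthogonal eigenvectors
assert np.allclose(A, Q @ np.diag(eigvals) @ Q.T)
assert np.allclose(Q.T @ Q, np.eye(5))
\end{verbatim}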

\subsection{Matrix Calculus}

\begin{definition}[Matrix Derivatives]
\label{def:matrix_derivatives}
For scalar function $f: \R^{m \times n} \rightarrow \R$:
\begin{equation}
\frac{\partial f}{\partial A} = \begin{bmatrix}
\frac{\partial f}{\partial A_{11}} & \cdots & \frac{\partial f}{\partial A_{1n}} \\
\vdots & \ddots & \vdots \\
\frac{\partial f}{\partial A_{m1}} & \cdots & \frac{\partial f}{\partial A_{mn}}
\end{bmatrix}
\end{equation}
\end{definition}

\begin{lemma}[Common Matrix Derivatives]
\label{lem:matrix_derivatives}
\begin{align}
\frac{\partial}{\partial A} \text{tr}(A) &= I \\
\frac{\partial}{\partial A} \text{tr}(AB) &= B^T \\
\frac{\partial}{\partial A} \text{tr}(A^TBA) &= B^TA + BA \\
\frac{\partial}{\partial A} \log \det(A) &= (A^T)^{-1}
\end{align}
\end{lemma}
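
Identities like these can be verified numerically with central finite differences. The sketch below checks one entry of the $\log\det$ gradient on a symmetric positive definite matrix, so that the determinant is positive:

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
M = rng.standard_normal((4, 4))
A = M @ M.T + 4 * np.eye(4)             # symmetric positive definite

analytic = np.linalg.inv(A).T           # claimed gradient of log det(A)

# Central finite-difference check of one entry of the gradient.
eps, i, j = 1e-6, 1, 2
E = np.zeros_like(A)
E[i, j] = eps
numeric = (np.log(np.linalg.det(A + E)) - np.log(np.linalg.det(A - E))) / (2 * eps)
print(analytic[i, j], numeric)          # should agree to several decimal places
\end{verbatim}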

\section{Convex Analysis}

\subsection{Convex Sets}

\begin{definition}[Convex Set]
\label{def:convex_set}
A set $C \subseteq \R^n$ is convex if for all $\mathbf{x}, \mathbf{y} \in C$ and $\lambda \in [0,1]$:
\begin{equation}
\lambda \mathbf{x} + (1-\lambda) \mathbf{y} \in C
\end{equation}
\end{definition}

\begin{definition}[Convex Hull]
\label{def:convex_hull}
The convex hull of set $S$ is the smallest convex set containing $S$:
\begin{equation}
\text{conv}(S) = \left\{\sum_{i=1}^k \lambda_i \mathbf{x}_i : k \geq 1, \mathbf{x}_i \in S, \lambda_i \geq 0, \sum_{i=1}^k \lambda_i = 1\right\}
\end{equation}
\end{definition}

\begin{definition}[Extreme Points]
\label{def:extreme_points}
Point $\mathbf{x} \in C$ is an extreme point of convex set $C$ if it cannot be written as $\mathbf{x} = \lambda \mathbf{y} + (1-\lambda)\mathbf{z}$ with $\mathbf{y}, \mathbf{z} \in C$, $\mathbf{y} \neq \mathbf{z}$, and $\lambda \in (0,1)$.
\end{definition}

\begin{theorem}[Krein-Milman Theorem]
\label{thm:krein_milman}
Every compact convex set in finite-dimensional space is the convex hull of its extreme points.
\end{theorem}

\subsection{Convex Functions}

\begin{definition}[Convex Function]
\label{def:convex_function}
Function $f: \R^n \rightarrow \R$ is convex if its domain is convex and for all $\mathbf{x}, \mathbf{y}$ in the domain and $\lambda \in [0,1]$:
\begin{equation}
f(\lambda \mathbf{x} + (1-\lambda) \mathbf{y}) \leq \lambda f(\mathbf{x}) + (1-\lambda) f(\mathbf{y})
\end{equation}
\end{definition}

\begin{theorem}[First-Order Characterization]
\label{thm:convex_first_order}
Differentiable function $f$ is convex iff for all $\mathbf{x}, \mathbf{y}$ in the domain:
\begin{equation}
f(\mathbf{y}) \geq f(\mathbf{x}) + \nabla f(\mathbf{x})^T(\mathbf{y} - \mathbf{x})
\end{equation}
\end{theorem}

\begin{theorem}[Second-Order Characterization]
\label{thm:convex_second_order}
Twice differentiable function $f$ is convex iff its Hessian satisfies $\nabla^2 f(\mathbf{x}) \succeq 0$ for all $\mathbf{x}$ in the domain.
\end{theorem}

\begin{definition}[Strong Convexity]
\label{def:strong_convexity}
Function $f$ is $\mu$-strongly convex if for all $\mathbf{x}, \mathbf{y}$ and $\lambda \in [0,1]$:
\begin{equation}
f(\lambda \mathbf{x} + (1-\lambda) \mathbf{y}) \leq \lambda f(\mathbf{x}) + (1-\lambda) f(\mathbf{y}) - \frac{\mu}{2}\lambda(1-\lambda)\|\mathbf{x} - \mathbf{y}\|^2
\end{equation}
\end{definition}

\section{Information Theory Basics}

\subsection{Entropy and Mutual Information}

\begin{definition}[Shannon Entropy]
\label{def:shannon_entropy}
For discrete random variable $X$ with probability mass function $p(x)$, using the convention $0 \log 0 = 0$:
\begin{equation}
H(X) = -\sum_{x} p(x) \log p(x)
\end{equation}
For continuous random variable with density $f(x)$ (differential entropy):
\begin{equation}
h(X) = -\int f(x) \log f(x) \, dx
\end{equation}
\end{definition}
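
A minimal Python implementation of the discrete entropy, using the convention $0 \log 0 = 0$ and a configurable logarithm base (base $2$ gives bits):

\begin{verbatim}
import numpy as np

def entropy(p, base=2.0):
    """Shannon entropy of a discrete distribution p; zero-probability terms contribute 0."""
    p = np.asarray(p, dtype=float)
    nz = p[p > 0]
    return float(-np.sum(nz * np.log(nz)) / np.log(base))

print(entropy([0.5, 0.5]))      # 1.0 bit: a fair coin
print(entropy([0.9, 0.1]))      # about 0.47 bits
print(entropy([0.25] * 4))      # 2.0 bits: uniform over four outcomes
\end{verbatim}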

\begin{definition}[Conditional Entropy]
\label{def:conditional_entropy}
\begin{equation}
H(X | Y) = \sum_{y} p(y) H(X | Y = y) = -\sum_{x,y} p(x,y) \log p(x|y)
\end{equation}
\end{definition}

\begin{definition}[Mutual Information]
\label{def:mutual_information}
\begin{equation}
I(X; Y) = H(X) - H(X | Y) = \sum_{x,y} p(x,y) \log \frac{p(x,y)}{p(x)p(y)}
\end{equation}
\end{definition}
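
Mutual information can be computed directly from a joint probability table via the second expression above; in the sketch below the two example tables are illustrative.

\begin{verbatim}
import numpy as np

def mutual_information(pxy, base=2.0):
    """I(X; Y) from a joint pmf given as a 2-D array p(x, y)."""
    pxy = np.asarray(pxy, dtype=float)
    px = pxy.sum(axis=1, keepdims=True)          # marginal p(x)
    py = pxy.sum(axis=0, keepdims=True)          # marginal p(y)
    mask = pxy > 0
    ratio = pxy[mask] / (px @ py)[mask]
    return float(np.sum(pxy[mask] * np.log(ratio)) / np.log(base))

print(mutual_information([[0.25, 0.25], [0.25, 0.25]]))  # 0.0: independent
print(mutual_information([[0.5, 0.0], [0.0, 0.5]]))      # 1.0 bit: X determines Y
\end{verbatim}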

\begin{theorem}[Chain Rule for Entropy]
\label{thm:entropy_chain_rule}
\begin{equation}
H(X_1, \ldots, X_n) = \sum_{i=1}^n H(X_i | X_1, \ldots, X_{i-1})
\end{equation}
\end{theorem}

\begin{theorem}[Data Processing Inequality]
\label{thm:data_processing}
If $X \rightarrow Y \rightarrow Z$ forms a Markov chain, then:
\begin{equation}
I(X; Z) \leq I(X; Y) \quad \text{and} \quad I(X; Z) \leq I(Y; Z)
\end{equation}
\end{theorem}

\subsection{Divergences and Distances}

\begin{definition}[Kullback-Leibler Divergence]
\label{def:kl_divergence}
For probability distributions $P$ and $Q$ with mass functions $p$ and $q$, where $q(x) > 0$ whenever $p(x) > 0$:
\begin{equation}
D_{KL}(P \| Q) = \sum_{x} p(x) \log \frac{p(x)}{q(x)} = \E_P\left[\log \frac{p(X)}{q(X)}\right]
\end{equation}
\end{definition}

\begin{theorem}[Gibbs' Inequality]
\label{thm:gibbs_inequality}
For probability distributions $P$ and $Q$:
\begin{equation}
D_{KL}(P \| Q) \geq 0
\end{equation}
with equality iff $P = Q$.
\end{theorem}

\begin{definition}[Jensen-Shannon Divergence]
\label{def:js_divergence}
\begin{equation}
D_{JS}(P \| Q) = \frac{1}{2}D_{KL}(P \| M) + \frac{1}{2}D_{KL}(Q \| M)
\end{equation}
where $M = \frac{1}{2}(P + Q)$ is the average distribution.
\end{definition}
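
Both divergences are straightforward to compute for discrete distributions. The sketch below builds $D_{JS}$ from two KL terms, illustrating Gibbs' inequality and the symmetry of the Jensen-Shannon divergence (values are in nats):

\begin{verbatim}
import numpy as np

def kl(p, q):
    """D_KL(P || Q) in nats; assumes q(x) > 0 wherever p(x) > 0."""
    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
    nz = p > 0
    return float(np.sum(p[nz] * np.log(p[nz] / q[nz])))

def js(p, q):
    """Jensen-Shannon divergence built from two KL terms."""
    m = 0.5 * (np.asarray(p, dtype=float) + np.asarray(q, dtype=float))
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)

p, q = [0.6, 0.3, 0.1], [0.2, 0.5, 0.3]
print(kl(p, q), kl(q, p))   # asymmetric, both non-negative (Gibbs' inequality)
print(js(p, q), js(q, p))   # symmetric and bounded above by log 2
\end{verbatim}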

\section{Graph Theory Fundamentals}

\subsection{Basic Definitions}

\begin{definition}[Graph]
\label{def:graph}
A graph $G = (V, E)$ consists of:
\begin{itemize}
    \item Vertex set $V = \{v_1, \ldots, v_n\}$
    \item Edge set $E \subseteq V \times V$ (for directed graphs) or $E \subseteq \binom{V}{2}$ (for undirected graphs)
\end{itemize}
\end{definition}

\begin{definition}[Graph Properties]
\label{def:graph_properties}
For graph $G = (V, E)$:
\begin{itemize}
    \item Degree of vertex $v$: $\deg(v) = |\{u \in V : (v,u) \in E\}|$ (the out-degree in the directed case)
    \item Path: sequence of distinct vertices $v_1, v_2, \ldots, v_k$ with $(v_i, v_{i+1}) \in E$ for $1 \leq i < k$
    \item Cycle: sequence $v_1, v_2, \ldots, v_k$ with $(v_i, v_{i+1}) \in E$, $v_1 = v_k$, $k \geq 3$, and $v_1, \ldots, v_{k-1}$ distinct
    \item Connected: path exists between every pair of vertices
    \item Tree: connected acyclic graph
\end{itemize}
\end{definition}

\subsection{Graph Algorithms}

\begin{theorem}[Dijkstra's Algorithm Correctness]
\label{thm:dijkstra}
Dijkstra's algorithm correctly computes shortest-path distances from a source vertex to all other vertices in a graph with non-negative edge weights, running in $O((|V| + |E|) \log |V|)$ time with a binary-heap priority queue.
\end{theorem}
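
A compact binary-heap implementation over adjacency lists with non-negative weights, shown here as an illustrative sketch rather than a tuned implementation:

\begin{verbatim}
import heapq

def dijkstra(adj, source):
    """Shortest-path distances from source; adj maps a vertex to (neighbor, weight) pairs."""
    dist = {source: 0.0}
    heap = [(0.0, source)]
    while heap:
        d, u = heapq.heappop(heap)
        if d > dist.get(u, float("inf")):
            continue                              # stale heap entry
        for v, w in adj.get(u, []):
            if d + w < dist.get(v, float("inf")):
                dist[v] = d + w
                heapq.heappush(heap, (dist[v], v))
    return dist

adj = {"a": [("b", 1), ("c", 4)], "b": [("c", 2), ("d", 5)], "c": [("d", 1)], "d": []}
print(dijkstra(adj, "a"))   # {'a': 0.0, 'b': 1.0, 'c': 3.0, 'd': 4.0}
\end{verbatim}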

\begin{theorem}[Max-Flow Min-Cut (Ford--Fulkerson)]
\label{thm:max_flow}
The maximum value of a flow from the source to the sink equals the minimum capacity over all cuts separating the source from the sink.
\end{theorem}

\subsection{Centrality Measures}

\begin{definition}[Betweenness Centrality]
\label{def:betweenness_centrality}
For vertex $v$:
\begin{equation}
C_B(v) = \sum_{s \neq v \neq t} \frac{\sigma_{st}(v)}{\sigma_{st}}
\end{equation}
where $\sigma_{st}$ is the number of shortest paths from $s$ to $t$, and $\sigma_{st}(v)$ is the number passing through $v$.
\end{definition}

\begin{definition}[PageRank]
\label{def:pagerank}
The PageRank vector $\mathbf{r}$ satisfies:
\begin{equation}
\mathbf{r} = d \mathbf{A}^T \mathbf{D}^{-1} \mathbf{r} + \frac{1-d}{n} \mathbf{1}
\end{equation}
where $\mathbf{A}$ is the adjacency matrix, $\mathbf{D}$ is the diagonal matrix of out-degrees (assumed positive for every vertex), $d \in [0,1)$ is the damping parameter, and $n = |V|$.
\end{definition}
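
The fixed point can be computed by power iteration on the equation above. The sketch below assumes row $i$ of $\mathbf{A}$ lists the outgoing edges of vertex $i$ and, as one common convention, gives dangling vertices (zero out-degree) uniform transition probabilities so the transition matrix stays stochastic:

\begin{verbatim}
import numpy as np

def pagerank(A, d=0.85, tol=1e-10, max_iter=1000):
    """Power iteration for r = d * A^T D^{-1} r + (1 - d)/n * 1."""
    A = np.asarray(A, dtype=float)
    n = A.shape[0]
    out_deg = A.sum(axis=1)
    # Row-stochastic transition matrix; dangling vertices get uniform rows.
    P = np.where(out_deg[:, None] > 0,
                 A / np.maximum(out_deg, 1.0)[:, None],
                 1.0 / n)
    r = np.full(n, 1.0 / n)
    for _ in range(max_iter):
        r_next = d * P.T @ r + (1.0 - d) / n
        if np.abs(r_next - r).sum() < tol:
            return r_next
        r = r_next
    return r

A = np.array([[0, 1, 1], [0, 0, 1], [1, 0, 0]])
print(pagerank(A))   # sums to 1; vertex 2 receives the largest score
\end{verbatim}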

\begin{theorem}[PageRank Convergence]
\label{thm:pagerank_convergence}
For $d \in [0,1)$, the PageRank iteration converges to a unique stationary distribution, namely the principal eigenvector (with eigenvalue $1$) of the Google matrix $d\,\mathbf{A}^T\mathbf{D}^{-1} + \frac{1-d}{n}\mathbf{1}\mathbf{1}^T$.
\end{theorem}

\section{Asymptotic Notation}

\begin{definition}[Big-O Notation]
\label{def:big_o}
For functions $f, g: \N \rightarrow \R^+$:
\begin{align}
f(n) = O(g(n)) &\iff \exists c > 0, n_0 \in \N : \forall n \geq n_0, f(n) \leq c \cdot g(n) \\
f(n) = \Omega(g(n)) &\iff \exists c > 0, n_0 \in \N : \forall n \geq n_0, f(n) \geq c \cdot g(n) \\
f(n) = \Theta(g(n)) &\iff f(n) = O(g(n)) \text{ and } f(n) = \Omega(g(n))
\end{align}
\end{definition}

This mathematical foundation provides the necessary background for understanding the theoretical developments throughout the book. The concepts presented here are used extensively in analyzing the performance, complexity, and correctness of algorithms in \ClaudeCode{} systems.